/*
*  This file is part of ygg-brute
*  Copyright (c) 2020 ygg-brute authors
*  See LICENSE for licensing information
*/

#pragma once

#include <cstdint>

__device__ __forceinline__ void mul_wide(uint32_t z[16], const uint32_t a[8], const uint32_t b[8])
{
    asm(R"({
        .reg.u32 t0, t1, t2, t3, t4, t5, t6, t7;
        .reg.u16 a0l, a1l, a2l, a3l, a4l, a5l, a6l, a7l;
        .reg.u16 a0h, a1h, a2h, a3h, a4h, a5h, a6h, a7h;
        .reg.u16 b0l, b1l, b2l, b3l, b4l, b5l, b6l, b7l;
        .reg.u16 b0h, b1h, b2h, b3h, b4h, b5h, b6h, b7h;
        mov.b32 { a0l, a0h }, %16;
        mov.b32 { a1l, a1h }, %17;
        mov.b32 { a2l, a2h }, %18;
        mov.b32 { a3l, a3h }, %19;
        mov.b32 { a4l, a4h }, %20;
        mov.b32 { a5l, a5h }, %21;
        mov.b32 { a6l, a6h }, %22;
        mov.b32 { a7l, a7h }, %23;
        mov.b32 { b0l, b0h }, %24;
        mov.b32 { b1l, b1h }, %25;
        mov.b32 { b2l, b2h }, %26;
        mov.b32 { b3l, b3h }, %27;
        mov.b32 { b4l, b4h }, %28;
        mov.b32 { b5l, b5h }, %29;
        mov.b32 { b6l, b6h }, %30;
        mov.b32 { b7l, b7h }, %31;
        mul.wide.u16 %0, a0h, b0l;
        mul.wide.u16 %1, a1h, b0l;
        mul.wide.u16 %2, a2h, b0l;
        mul.wide.u16 %3, a3h, b0l;
        mul.wide.u16 %4, a4h, b0l;
        mul.wide.u16 %5, a5h, b0l;
        mul.wide.u16 %6, a6h, b0l;
        mul.wide.u16 %7, a7h, b0l;
        mul.wide.u16 t0, a0l, b0h;
        mul.wide.u16 t1, a1l, b0h;
        mul.wide.u16 t2, a2l, b0h;
        mul.wide.u16 t3, a3l, b0h;
        mul.wide.u16 t4, a4l, b0h;
        mul.wide.u16 t5, a5l, b0h;
        mul.wide.u16 t6, a6l, b0h;
        mul.wide.u16 t7, a7l, b0h;
        add.cc.u32 %0, %0, t0;
        addc.cc.u32 %1, %1, t1;
        addc.cc.u32 %2, %2, t2;
        addc.cc.u32 %3, %3, t3;
        addc.cc.u32 %4, %4, t4;
        addc.cc.u32 %5, %5, t5;
        addc.cc.u32 %6, %6, t6;
        addc.cc.u32 %7, %7, t7;
        addc.u32 %8, 0, 0;
        mul.wide.u16 t0, a0h, b1l;
        mul.wide.u16 t1, a1h, b1l;
        mul.wide.u16 t2, a2h, b1l;
        mul.wide.u16 t3, a3h, b1l;
        mul.wide.u16 t4, a4h, b1l;
        mul.wide.u16 t5, a5h, b1l;
        mul.wide.u16 t6, a6h, b1l;
        mul.wide.u16 t7, a7h, b1l;
        add.cc.u32 %1, %1, t0;
        addc.cc.u32 %2, %2, t1;
        addc.cc.u32 %3, %3, t2;
        addc.cc.u32 %4, %4, t3;
        addc.cc.u32 %5, %5, t4;
        addc.cc.u32 %6, %6, t5;
        addc.cc.u32 %7, %7, t6;
        addc.cc.u32 %8, %8, t7;
        mul.wide.u16 t0, a0l, b1h;
        mul.wide.u16 t1, a1l, b1h;
        mul.wide.u16 t2, a2l, b1h;
        mul.wide.u16 t3, a3l, b1h;
        mul.wide.u16 t4, a4l, b1h;
        mul.wide.u16 t5, a5l, b1h;
        mul.wide.u16 t6, a6l, b1h;
        mul.wide.u16 t7, a7l, b1h;
        add.cc.u32 %1, %1, t0;
        addc.cc.u32 %2, %2, t1;
        addc.cc.u32 %3, %3, t2;
        addc.cc.u32 %4, %4, t3;
        addc.cc.u32 %5, %5, t4;
        addc.cc.u32 %6, %6, t5;
        addc.cc.u32 %7, %7, t6;
        addc.cc.u32 %8, %8, t7;
        addc.u32 %9, 0, 0;
        mul.wide.u16 t0, a0h, b2l;
        mul.wide.u16 t1, a1h, b2l;
        mul.wide.u16 t2, a2h, b2l;
        mul.wide.u16 t3, a3h, b2l;
        mul.wide.u16 t4, a4h, b2l;
        mul.wide.u16 t5, a5h, b2l;
        mul.wide.u16 t6, a6h, b2l;
        mul.wide.u16 t7, a7h, b2l;
        add.cc.u32 %2, %2, t0;
        addc.cc.u32 %3, %3, t1;
        addc.cc.u32 %4, %4, t2;
        addc.cc.u32 %5, %5, t3;
        addc.cc.u32 %6, %6, t4;
        addc.cc.u32 %7, %7, t5;
        addc.cc.u32 %8, %8, t6;
        addc.cc.u32 %9, %9, t7;
        mul.wide.u16 t0, a0l, b2h;
        mul.wide.u16 t1, a1l, b2h;
        mul.wide.u16 t2, a2l, b2h;
        mul.wide.u16 t3, a3l, b2h;
        mul.wide.u16 t4, a4l, b2h;
        mul.wide.u16 t5, a5l, b2h;
        mul.wide.u16 t6, a6l, b2h;
        mul.wide.u16 t7, a7l, b2h;
        add.cc.u32 %2, %2, t0;
        addc.cc.u32 %3, %3, t1;
        addc.cc.u32 %4, %4, t2;
        addc.cc.u32 %5, %5, t3;
        addc.cc.u32 %6, %6, t4;
        addc.cc.u32 %7, %7, t5;
        addc.cc.u32 %8, %8, t6;
        addc.cc.u32 %9, %9, t7;
        addc.u32 %10, 0, 0;
        mul.wide.u16 t0, a0h, b3l;
        mul.wide.u16 t1, a1h, b3l;
        mul.wide.u16 t2, a2h, b3l;
        mul.wide.u16 t3, a3h, b3l;
        mul.wide.u16 t4, a4h, b3l;
        mul.wide.u16 t5, a5h, b3l;
        mul.wide.u16 t6, a6h, b3l;
        mul.wide.u16 t7, a7h, b3l;
        add.cc.u32 %3, %3, t0;
        addc.cc.u32 %4, %4, t1;
        addc.cc.u32 %5, %5, t2;
        addc.cc.u32 %6, %6, t3;
        addc.cc.u32 %7, %7, t4;
        addc.cc.u32 %8, %8, t5;
        addc.cc.u32 %9, %9, t6;
        addc.cc.u32 %10, %10, t7;
        mul.wide.u16 t0, a0l, b3h;
        mul.wide.u16 t1, a1l, b3h;
        mul.wide.u16 t2, a2l, b3h;
        mul.wide.u16 t3, a3l, b3h;
        mul.wide.u16 t4, a4l, b3h;
        mul.wide.u16 t5, a5l, b3h;
        mul.wide.u16 t6, a6l, b3h;
        mul.wide.u16 t7, a7l, b3h;
        add.cc.u32 %3, %3, t0;
        addc.cc.u32 %4, %4, t1;
        addc.cc.u32 %5, %5, t2;
        addc.cc.u32 %6, %6, t3;
        addc.cc.u32 %7, %7, t4;
        addc.cc.u32 %8, %8, t5;
        addc.cc.u32 %9, %9, t6;
        addc.cc.u32 %10, %10, t7;
        addc.u32 %11, 0, 0;
        mul.wide.u16 t0, a0h, b4l;
        mul.wide.u16 t1, a1h, b4l;
        mul.wide.u16 t2, a2h, b4l;
        mul.wide.u16 t3, a3h, b4l;
        mul.wide.u16 t4, a4h, b4l;
        mul.wide.u16 t5, a5h, b4l;
        mul.wide.u16 t6, a6h, b4l;
        mul.wide.u16 t7, a7h, b4l;
        add.cc.u32 %4, %4, t0;
        addc.cc.u32 %5, %5, t1;
        addc.cc.u32 %6, %6, t2;
        addc.cc.u32 %7, %7, t3;
        addc.cc.u32 %8, %8, t4;
        addc.cc.u32 %9, %9, t5;
        addc.cc.u32 %10, %10, t6;
        addc.cc.u32 %11, %11, t7;
        mul.wide.u16 t0, a0l, b4h;
        mul.wide.u16 t1, a1l, b4h;
        mul.wide.u16 t2, a2l, b4h;
        mul.wide.u16 t3, a3l, b4h;
        mul.wide.u16 t4, a4l, b4h;
        mul.wide.u16 t5, a5l, b4h;
        mul.wide.u16 t6, a6l, b4h;
        mul.wide.u16 t7, a7l, b4h;
        add.cc.u32 %4, %4, t0;
        addc.cc.u32 %5, %5, t1;
        addc.cc.u32 %6, %6, t2;
        addc.cc.u32 %7, %7, t3;
        addc.cc.u32 %8, %8, t4;
        addc.cc.u32 %9, %9, t5;
        addc.cc.u32 %10, %10, t6;
        addc.cc.u32 %11, %11, t7;
        addc.u32 %12, 0, 0;
        mul.wide.u16 t0, a0h, b5l;
        mul.wide.u16 t1, a1h, b5l;
        mul.wide.u16 t2, a2h, b5l;
        mul.wide.u16 t3, a3h, b5l;
        mul.wide.u16 t4, a4h, b5l;
        mul.wide.u16 t5, a5h, b5l;
        mul.wide.u16 t6, a6h, b5l;
        mul.wide.u16 t7, a7h, b5l;
        add.cc.u32 %5, %5, t0;
        addc.cc.u32 %6, %6, t1;
        addc.cc.u32 %7, %7, t2;
        addc.cc.u32 %8, %8, t3;
        addc.cc.u32 %9, %9, t4;
        addc.cc.u32 %10, %10, t5;
        addc.cc.u32 %11, %11, t6;
        addc.cc.u32 %12, %12, t7;
        mul.wide.u16 t0, a0l, b5h;
        mul.wide.u16 t1, a1l, b5h;
        mul.wide.u16 t2, a2l, b5h;
        mul.wide.u16 t3, a3l, b5h;
        mul.wide.u16 t4, a4l, b5h;
        mul.wide.u16 t5, a5l, b5h;
        mul.wide.u16 t6, a6l, b5h;
        mul.wide.u16 t7, a7l, b5h;
        add.cc.u32 %5, %5, t0;
        addc.cc.u32 %6, %6, t1;
        addc.cc.u32 %7, %7, t2;
        addc.cc.u32 %8, %8, t3;
        addc.cc.u32 %9, %9, t4;
        addc.cc.u32 %10, %10, t5;
        addc.cc.u32 %11, %11, t6;
        addc.cc.u32 %12, %12, t7;
        addc.u32 %13, 0, 0;
        mul.wide.u16 t0, a0h, b6l;
        mul.wide.u16 t1, a1h, b6l;
        mul.wide.u16 t2, a2h, b6l;
        mul.wide.u16 t3, a3h, b6l;
        mul.wide.u16 t4, a4h, b6l;
        mul.wide.u16 t5, a5h, b6l;
        mul.wide.u16 t6, a6h, b6l;
        mul.wide.u16 t7, a7h, b6l;
        add.cc.u32 %6, %6, t0;
        addc.cc.u32 %7, %7, t1;
        addc.cc.u32 %8, %8, t2;
        addc.cc.u32 %9, %9, t3;
        addc.cc.u32 %10, %10, t4;
        addc.cc.u32 %11, %11, t5;
        addc.cc.u32 %12, %12, t6;
        addc.cc.u32 %13, %13, t7;
        mul.wide.u16 t0, a0l, b6h;
        mul.wide.u16 t1, a1l, b6h;
        mul.wide.u16 t2, a2l, b6h;
        mul.wide.u16 t3, a3l, b6h;
        mul.wide.u16 t4, a4l, b6h;
        mul.wide.u16 t5, a5l, b6h;
        mul.wide.u16 t6, a6l, b6h;
        mul.wide.u16 t7, a7l, b6h;
        add.cc.u32 %6, %6, t0;
        addc.cc.u32 %7, %7, t1;
        addc.cc.u32 %8, %8, t2;
        addc.cc.u32 %9, %9, t3;
        addc.cc.u32 %10, %10, t4;
        addc.cc.u32 %11, %11, t5;
        addc.cc.u32 %12, %12, t6;
        addc.cc.u32 %13, %13, t7;
        addc.u32 %14, 0, 0;
        mul.wide.u16 t0, a0h, b7l;
        mul.wide.u16 t1, a1h, b7l;
        mul.wide.u16 t2, a2h, b7l;
        mul.wide.u16 t3, a3h, b7l;
        mul.wide.u16 t4, a4h, b7l;
        mul.wide.u16 t5, a5h, b7l;
        mul.wide.u16 t6, a6h, b7l;
        mul.wide.u16 t7, a7h, b7l;
        add.cc.u32 %7, %7, t0;
        addc.cc.u32 %8, %8, t1;
        addc.cc.u32 %9, %9, t2;
        addc.cc.u32 %10, %10, t3;
        addc.cc.u32 %11, %11, t4;
        addc.cc.u32 %12, %12, t5;
        addc.cc.u32 %13, %13, t6;
        addc.cc.u32 %14, %14, t7;
        mul.wide.u16 t0, a0l, b7h;
        mul.wide.u16 t1, a1l, b7h;
        mul.wide.u16 t2, a2l, b7h;
        mul.wide.u16 t3, a3l, b7h;
        mul.wide.u16 t4, a4l, b7h;
        mul.wide.u16 t5, a5l, b7h;
        mul.wide.u16 t6, a6l, b7h;
        mul.wide.u16 t7, a7l, b7h;
        add.cc.u32 %7, %7, t0;
        addc.cc.u32 %8, %8, t1;
        addc.cc.u32 %9, %9, t2;
        addc.cc.u32 %10, %10, t3;
        addc.cc.u32 %11, %11, t4;
        addc.cc.u32 %12, %12, t5;
        addc.cc.u32 %13, %13, t6;
        addc.cc.u32 %14, %14, t7;
        addc.u32 %15, 0, 0;
        shf.l.clamp.b32 %15, %14, %15, 16;
        shf.l.clamp.b32 %14, %13, %14, 16;
        shf.l.clamp.b32 %13, %12, %13, 16;
        shf.l.clamp.b32 %12, %11, %12, 16;
        shf.l.clamp.b32 %11, %10, %11, 16;
        shf.l.clamp.b32 %10, %9, %10, 16;
        shf.l.clamp.b32 %9, %8, %9, 16;
        shf.l.clamp.b32 %8, %7, %8, 16;
        shf.l.clamp.b32 %7, %6, %7, 16;
        shf.l.clamp.b32 %6, %5, %6, 16;
        shf.l.clamp.b32 %5, %4, %5, 16;
        shf.l.clamp.b32 %4, %3, %4, 16;
        shf.l.clamp.b32 %3, %2, %3, 16;
        shf.l.clamp.b32 %2, %1, %2, 16;
        shf.l.clamp.b32 %1, %0, %1, 16;
        shl.b32 %0, %0, 16;
        mul.wide.u16 t0, a0l, b0l;
        mul.wide.u16 t1, a1l, b0l;
        mul.wide.u16 t2, a2l, b0l;
        mul.wide.u16 t3, a3l, b0l;
        mul.wide.u16 t4, a4l, b0l;
        mul.wide.u16 t5, a5l, b0l;
        mul.wide.u16 t6, a6l, b0l;
        mul.wide.u16 t7, a7l, b0l;
        add.cc.u32 %0, %0, t0;
        addc.cc.u32 %1, %1, t1;
        addc.cc.u32 %2, %2, t2;
        addc.cc.u32 %3, %3, t3;
        addc.cc.u32 %4, %4, t4;
        addc.cc.u32 %5, %5, t5;
        addc.cc.u32 %6, %6, t6;
        addc.cc.u32 %7, %7, t7;
        addc.cc.u32 %8, %8, 0;
        addc.cc.u32 %9, %9, 0;
        addc.cc.u32 %10, %10, 0;
        addc.cc.u32 %11, %11, 0;
        addc.cc.u32 %12, %12, 0;
        addc.cc.u32 %13, %13, 0;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0h, b0h;
        mul.wide.u16 t1, a1h, b0h;
        mul.wide.u16 t2, a2h, b0h;
        mul.wide.u16 t3, a3h, b0h;
        mul.wide.u16 t4, a4h, b0h;
        mul.wide.u16 t5, a5h, b0h;
        mul.wide.u16 t6, a6h, b0h;
        mul.wide.u16 t7, a7h, b0h;
        add.cc.u32 %1, %1, t0;
        addc.cc.u32 %2, %2, t1;
        addc.cc.u32 %3, %3, t2;
        addc.cc.u32 %4, %4, t3;
        addc.cc.u32 %5, %5, t4;
        addc.cc.u32 %6, %6, t5;
        addc.cc.u32 %7, %7, t6;
        addc.cc.u32 %8, %8, t7;
        addc.cc.u32 %9, %9, 0;
        addc.cc.u32 %10, %10, 0;
        addc.cc.u32 %11, %11, 0;
        addc.cc.u32 %12, %12, 0;
        addc.cc.u32 %13, %13, 0;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0l, b1l;
        mul.wide.u16 t1, a1l, b1l;
        mul.wide.u16 t2, a2l, b1l;
        mul.wide.u16 t3, a3l, b1l;
        mul.wide.u16 t4, a4l, b1l;
        mul.wide.u16 t5, a5l, b1l;
        mul.wide.u16 t6, a6l, b1l;
        mul.wide.u16 t7, a7l, b1l;
        add.cc.u32 %1, %1, t0;
        addc.cc.u32 %2, %2, t1;
        addc.cc.u32 %3, %3, t2;
        addc.cc.u32 %4, %4, t3;
        addc.cc.u32 %5, %5, t4;
        addc.cc.u32 %6, %6, t5;
        addc.cc.u32 %7, %7, t6;
        addc.cc.u32 %8, %8, t7;
        addc.cc.u32 %9, %9, 0;
        addc.cc.u32 %10, %10, 0;
        addc.cc.u32 %11, %11, 0;
        addc.cc.u32 %12, %12, 0;
        addc.cc.u32 %13, %13, 0;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0h, b1h;
        mul.wide.u16 t1, a1h, b1h;
        mul.wide.u16 t2, a2h, b1h;
        mul.wide.u16 t3, a3h, b1h;
        mul.wide.u16 t4, a4h, b1h;
        mul.wide.u16 t5, a5h, b1h;
        mul.wide.u16 t6, a6h, b1h;
        mul.wide.u16 t7, a7h, b1h;
        add.cc.u32 %2, %2, t0;
        addc.cc.u32 %3, %3, t1;
        addc.cc.u32 %4, %4, t2;
        addc.cc.u32 %5, %5, t3;
        addc.cc.u32 %6, %6, t4;
        addc.cc.u32 %7, %7, t5;
        addc.cc.u32 %8, %8, t6;
        addc.cc.u32 %9, %9, t7;
        addc.cc.u32 %10, %10, 0;
        addc.cc.u32 %11, %11, 0;
        addc.cc.u32 %12, %12, 0;
        addc.cc.u32 %13, %13, 0;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0l, b2l;
        mul.wide.u16 t1, a1l, b2l;
        mul.wide.u16 t2, a2l, b2l;
        mul.wide.u16 t3, a3l, b2l;
        mul.wide.u16 t4, a4l, b2l;
        mul.wide.u16 t5, a5l, b2l;
        mul.wide.u16 t6, a6l, b2l;
        mul.wide.u16 t7, a7l, b2l;
        add.cc.u32 %2, %2, t0;
        addc.cc.u32 %3, %3, t1;
        addc.cc.u32 %4, %4, t2;
        addc.cc.u32 %5, %5, t3;
        addc.cc.u32 %6, %6, t4;
        addc.cc.u32 %7, %7, t5;
        addc.cc.u32 %8, %8, t6;
        addc.cc.u32 %9, %9, t7;
        addc.cc.u32 %10, %10, 0;
        addc.cc.u32 %11, %11, 0;
        addc.cc.u32 %12, %12, 0;
        addc.cc.u32 %13, %13, 0;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0h, b2h;
        mul.wide.u16 t1, a1h, b2h;
        mul.wide.u16 t2, a2h, b2h;
        mul.wide.u16 t3, a3h, b2h;
        mul.wide.u16 t4, a4h, b2h;
        mul.wide.u16 t5, a5h, b2h;
        mul.wide.u16 t6, a6h, b2h;
        mul.wide.u16 t7, a7h, b2h;
        add.cc.u32 %3, %3, t0;
        addc.cc.u32 %4, %4, t1;
        addc.cc.u32 %5, %5, t2;
        addc.cc.u32 %6, %6, t3;
        addc.cc.u32 %7, %7, t4;
        addc.cc.u32 %8, %8, t5;
        addc.cc.u32 %9, %9, t6;
        addc.cc.u32 %10, %10, t7;
        addc.cc.u32 %11, %11, 0;
        addc.cc.u32 %12, %12, 0;
        addc.cc.u32 %13, %13, 0;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0l, b3l;
        mul.wide.u16 t1, a1l, b3l;
        mul.wide.u16 t2, a2l, b3l;
        mul.wide.u16 t3, a3l, b3l;
        mul.wide.u16 t4, a4l, b3l;
        mul.wide.u16 t5, a5l, b3l;
        mul.wide.u16 t6, a6l, b3l;
        mul.wide.u16 t7, a7l, b3l;
        add.cc.u32 %3, %3, t0;
        addc.cc.u32 %4, %4, t1;
        addc.cc.u32 %5, %5, t2;
        addc.cc.u32 %6, %6, t3;
        addc.cc.u32 %7, %7, t4;
        addc.cc.u32 %8, %8, t5;
        addc.cc.u32 %9, %9, t6;
        addc.cc.u32 %10, %10, t7;
        addc.cc.u32 %11, %11, 0;
        addc.cc.u32 %12, %12, 0;
        addc.cc.u32 %13, %13, 0;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0h, b3h;
        mul.wide.u16 t1, a1h, b3h;
        mul.wide.u16 t2, a2h, b3h;
        mul.wide.u16 t3, a3h, b3h;
        mul.wide.u16 t4, a4h, b3h;
        mul.wide.u16 t5, a5h, b3h;
        mul.wide.u16 t6, a6h, b3h;
        mul.wide.u16 t7, a7h, b3h;
        add.cc.u32 %4, %4, t0;
        addc.cc.u32 %5, %5, t1;
        addc.cc.u32 %6, %6, t2;
        addc.cc.u32 %7, %7, t3;
        addc.cc.u32 %8, %8, t4;
        addc.cc.u32 %9, %9, t5;
        addc.cc.u32 %10, %10, t6;
        addc.cc.u32 %11, %11, t7;
        addc.cc.u32 %12, %12, 0;
        addc.cc.u32 %13, %13, 0;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0l, b4l;
        mul.wide.u16 t1, a1l, b4l;
        mul.wide.u16 t2, a2l, b4l;
        mul.wide.u16 t3, a3l, b4l;
        mul.wide.u16 t4, a4l, b4l;
        mul.wide.u16 t5, a5l, b4l;
        mul.wide.u16 t6, a6l, b4l;
        mul.wide.u16 t7, a7l, b4l;
        add.cc.u32 %4, %4, t0;
        addc.cc.u32 %5, %5, t1;
        addc.cc.u32 %6, %6, t2;
        addc.cc.u32 %7, %7, t3;
        addc.cc.u32 %8, %8, t4;
        addc.cc.u32 %9, %9, t5;
        addc.cc.u32 %10, %10, t6;
        addc.cc.u32 %11, %11, t7;
        addc.cc.u32 %12, %12, 0;
        addc.cc.u32 %13, %13, 0;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0h, b4h;
        mul.wide.u16 t1, a1h, b4h;
        mul.wide.u16 t2, a2h, b4h;
        mul.wide.u16 t3, a3h, b4h;
        mul.wide.u16 t4, a4h, b4h;
        mul.wide.u16 t5, a5h, b4h;
        mul.wide.u16 t6, a6h, b4h;
        mul.wide.u16 t7, a7h, b4h;
        add.cc.u32 %5, %5, t0;
        addc.cc.u32 %6, %6, t1;
        addc.cc.u32 %7, %7, t2;
        addc.cc.u32 %8, %8, t3;
        addc.cc.u32 %9, %9, t4;
        addc.cc.u32 %10, %10, t5;
        addc.cc.u32 %11, %11, t6;
        addc.cc.u32 %12, %12, t7;
        addc.cc.u32 %13, %13, 0;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0l, b5l;
        mul.wide.u16 t1, a1l, b5l;
        mul.wide.u16 t2, a2l, b5l;
        mul.wide.u16 t3, a3l, b5l;
        mul.wide.u16 t4, a4l, b5l;
        mul.wide.u16 t5, a5l, b5l;
        mul.wide.u16 t6, a6l, b5l;
        mul.wide.u16 t7, a7l, b5l;
        add.cc.u32 %5, %5, t0;
        addc.cc.u32 %6, %6, t1;
        addc.cc.u32 %7, %7, t2;
        addc.cc.u32 %8, %8, t3;
        addc.cc.u32 %9, %9, t4;
        addc.cc.u32 %10, %10, t5;
        addc.cc.u32 %11, %11, t6;
        addc.cc.u32 %12, %12, t7;
        addc.cc.u32 %13, %13, 0;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0h, b5h;
        mul.wide.u16 t1, a1h, b5h;
        mul.wide.u16 t2, a2h, b5h;
        mul.wide.u16 t3, a3h, b5h;
        mul.wide.u16 t4, a4h, b5h;
        mul.wide.u16 t5, a5h, b5h;
        mul.wide.u16 t6, a6h, b5h;
        mul.wide.u16 t7, a7h, b5h;
        add.cc.u32 %6, %6, t0;
        addc.cc.u32 %7, %7, t1;
        addc.cc.u32 %8, %8, t2;
        addc.cc.u32 %9, %9, t3;
        addc.cc.u32 %10, %10, t4;
        addc.cc.u32 %11, %11, t5;
        addc.cc.u32 %12, %12, t6;
        addc.cc.u32 %13, %13, t7;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0l, b6l;
        mul.wide.u16 t1, a1l, b6l;
        mul.wide.u16 t2, a2l, b6l;
        mul.wide.u16 t3, a3l, b6l;
        mul.wide.u16 t4, a4l, b6l;
        mul.wide.u16 t5, a5l, b6l;
        mul.wide.u16 t6, a6l, b6l;
        mul.wide.u16 t7, a7l, b6l;
        add.cc.u32 %6, %6, t0;
        addc.cc.u32 %7, %7, t1;
        addc.cc.u32 %8, %8, t2;
        addc.cc.u32 %9, %9, t3;
        addc.cc.u32 %10, %10, t4;
        addc.cc.u32 %11, %11, t5;
        addc.cc.u32 %12, %12, t6;
        addc.cc.u32 %13, %13, t7;
        addc.cc.u32 %14, %14, 0;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0h, b6h;
        mul.wide.u16 t1, a1h, b6h;
        mul.wide.u16 t2, a2h, b6h;
        mul.wide.u16 t3, a3h, b6h;
        mul.wide.u16 t4, a4h, b6h;
        mul.wide.u16 t5, a5h, b6h;
        mul.wide.u16 t6, a6h, b6h;
        mul.wide.u16 t7, a7h, b6h;
        add.cc.u32 %7, %7, t0;
        addc.cc.u32 %8, %8, t1;
        addc.cc.u32 %9, %9, t2;
        addc.cc.u32 %10, %10, t3;
        addc.cc.u32 %11, %11, t4;
        addc.cc.u32 %12, %12, t5;
        addc.cc.u32 %13, %13, t6;
        addc.cc.u32 %14, %14, t7;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0l, b7l;
        mul.wide.u16 t1, a1l, b7l;
        mul.wide.u16 t2, a2l, b7l;
        mul.wide.u16 t3, a3l, b7l;
        mul.wide.u16 t4, a4l, b7l;
        mul.wide.u16 t5, a5l, b7l;
        mul.wide.u16 t6, a6l, b7l;
        mul.wide.u16 t7, a7l, b7l;
        add.cc.u32 %7, %7, t0;
        addc.cc.u32 %8, %8, t1;
        addc.cc.u32 %9, %9, t2;
        addc.cc.u32 %10, %10, t3;
        addc.cc.u32 %11, %11, t4;
        addc.cc.u32 %12, %12, t5;
        addc.cc.u32 %13, %13, t6;
        addc.cc.u32 %14, %14, t7;
        addc.u32 %15, %15, 0;
        mul.wide.u16 t0, a0h, b7h;
        mul.wide.u16 t1, a1h, b7h;
        mul.wide.u16 t2, a2h, b7h;
        mul.wide.u16 t3, a3h, b7h;
        mul.wide.u16 t4, a4h, b7h;
        mul.wide.u16 t5, a5h, b7h;
        mul.wide.u16 t6, a6h, b7h;
        mul.wide.u16 t7, a7h, b7h;
        add.cc.u32 %8, %8, t0;
        addc.cc.u32 %9, %9, t1;
        addc.cc.u32 %10, %10, t2;
        addc.cc.u32 %11, %11, t3;
        addc.cc.u32 %12, %12, t4;
        addc.cc.u32 %13, %13, t5;
        addc.cc.u32 %14, %14, t6;
        addc.u32 %15, %15, t7;
   })"
   : "=r"(z[0]), "=r"(z[1]), "=r"(z[2]), "=r"(z[3]), "=r"(z[4]), "=r"(z[5]), "=r"(z[6]), "=r"(z[7]),
     "=r"(z[8]), "=r"(z[9]), "=r"(z[10]), "=r"(z[11]), "=r"(z[12]), "=r"(z[13]), "=r"(z[14]), "=r"(z[15])
   : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(a[4]), "r"(a[5]), "r"(a[6]), "r"(a[7]),
     "r"(b[0]), "r"(b[1]), "r"(b[2]), "r"(b[3]), "r"(b[4]), "r"(b[5]), "r"(b[6]), "r"(b[7])
    );
}
